#################################################################################
# Load raw data, annotate probes using biomaRt and load SFARI genes
# Load csvs
datExpr = read.csv('./../raw_data/RNAseq_ASD_datExpr.csv', row.names=1)
datMeta = read.csv('./../raw_data/RNAseq_ASD_datMeta.csv')
SFARI_genes = read_csv('./../working_data/SFARI_genes_01-15-2019.csv')
# Make sure datExpr and datMeta columns/rows match
rownames(datMeta) = paste0('X', datMeta$Dissected_Sample_ID)
if(!all(colnames(datExpr) == rownames(datMeta))){
print('Columns in datExpr don\'t match the rows in datMeta!')
}
# Make data transformation in datMeta
rownames(datMeta) = datMeta$Dissected_Sample_ID
datMeta$Dx = factor(datMeta$Diagnosis_, levels=c('CTL', 'ASD'))
datMeta$Sex = as.factor(datMeta$Sex)
datMeta$Brain_Bank = as.factor(datMeta$Brain_Bank)
datMeta$Brain_Region = as.factor(datMeta$Region)
datMeta$Brain_lobe = 'Occipital'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA4_6', 'BA9', 'BA24', 'BA44_45')] = 'Frontal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA3_1_2_5', 'BA7')] = 'Parietal'
datMeta$Brain_lobe[datMeta$Brain_Region %in% c('BA38', 'BA39_40', 'BA20_37', 'BA41_42_22')] = 'Temporal'
datMeta$Brain_lobe=factor(datMeta$Brain_lobe, levels=c('Frontal', 'Temporal', 'Parietal', 'Occipital'))
datMeta$RIN[is.na(datMeta$RIN)] = mean(datMeta$RIN, na.rm=T)
#################################################################################
# FILTERS:
# 1 Filter probes without a regular ensemble ID (filter 5)
to_keep = datExpr %>% rownames %>% startsWith('ENS')
datExpr = datExpr[to_keep,]
# 2. Filter samples from ID AN03345 (filter 2)
to_keep = (datMeta$Subject_ID != 'AN03345')
datMeta = datMeta[to_keep,]
datExpr = datExpr[,to_keep]
# 3. Filter genes with zeros in all their entries (filter 13795)
to_keep = rowSums(datExpr)>0
datExpr = datExpr[to_keep,]
#################################################################################
# Annotate SFARI genes
# Get ensemble IDS for SFARI genes
mart = useMart(biomart='ENSEMBL_MART_ENSEMBL',
dataset='hsapiens_gene_ensembl',
host='feb2014.archive.ensembl.org') ## Gencode v19
gene_names = getBM(attributes=c('ensembl_gene_id', 'hgnc_symbol'), filters=c('hgnc_symbol'),
values=SFARI_genes$`gene-symbol`, mart=mart) %>%
mutate('gene-symbol'=hgnc_symbol, 'ID'=as.character(ensembl_gene_id)) %>%
dplyr::select('ID', 'gene-symbol')
SFARI_genes = left_join(SFARI_genes, gene_names, by='gene-symbol') %>%
distinct(ID, .keep_all = TRUE)
#################################################################################
# Add functional annotation to genes from GO
GO_annotations = read.csv('./../working_data/genes_GO_annotations.csv')
GO_neuronal = GO_annotations %>% filter(grepl('neuron', go_term)) %>%
mutate('ID' = as.character(ensembl_gene_id)) %>%
dplyr::select(-ensembl_gene_id) %>% distinct(ID) %>%
mutate('Neural' = 1)
#################################################################################
# Load genes DE info
genes_DE_info = read.csv('./../working_data/genes_ASD_DE_info_raw.csv')
genes_DE_info = genes_DE_info %>% filter(ID %in% rownames(datExpr)) %>%
mutate(gene.score=ifelse(is.na(gene.score), 'None', gene.score)) %>%
distinct(ID, .keep_all = TRUE) %>% left_join(GO_neuronal, by='ID') %>%
mutate(Neural=ifelse(is.na(Neural), 0, Neural)) %>%
mutate(gene.score=ifelse(gene.score=='None' & Neural==1, 'Neural', gene.score))
rm(to_keep, mart, gene_names, GO_annotations)
Number of genes:
nrow(datExpr)
## [1] 49882
Gene count by SFARI score:
table(genes_DE_info$gene.score)
##
## 1 2 3 4 5 6 Neural None
## 24 60 189 446 172 24 1017 47950
GO Annotations:
print(glue(nrow(GO_neuronal$ID), ' genes have neuronal-related annotations.'))
print(glue(sum(SFARI_genes$ID %in% GO_neuronal$ID),' of these genes have a SFARI score.'))
## 206 of these genes have a SFARI score.
table(genes_DE_info$Neural[genes_DE_info$gene.score %in% c('1','2','3','4','5','6')],
genes_DE_info$gene.score[genes_DE_info$gene.score %in% c('1','2','3','4','5','6')])
##
## 1 2 3 4 5 6
## 0 15 44 148 371 133 20
## 1 9 16 41 75 39 4
Logarithmic transformation of the data seems the invert the behaviour of genes by SFARI score
p1 = ggplotly(genes_DE_info %>% ggplot(aes(x=gene.score, y=abs(logFC), fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(8)) +
theme_minimal() + theme(legend.position = 'none'))
genes_DE_info_log2 = read.csv('./../working_data/genes_ASD_DE_info_raw_log2.csv')
genes_DE_info_log2 = genes_DE_info_log2 %>% filter(ID %in% rownames(datExpr)) %>%
mutate(gene.score=ifelse(is.na(gene.score), 'None', gene.score)) %>%
distinct(ID, .keep_all = TRUE) %>%
mutate(gene.score=ifelse(gene.score=='None' & ID %in% GO_neuronal$ID, 'Neural', gene.score))
p2 = ggplotly(genes_DE_info_log2 %>% ggplot(aes(x=gene.score, y=abs(logFC), fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(8)) +
theme_minimal() + theme(legend.position = 'none') +
ggtitle('logFC before and after log2 transformation'))
subplot(p1, p2, nrows=1)
rm(p1, p2)
Higher gene expression values are more affected by logarithmic transformation, and since genes with score 1 have the highest expressions, they are the most affected by the transformation.
Also, there seems to be a relation between gene expression levels and their variation.
m = data.frame('ID' = rownames(datExpr), 'gene_mean' = apply(datExpr, 1, mean))
m_scores = genes_DE_info %>% dplyr::select(gene.score, ID) %>% right_join(m, by='ID')
p1 = ggplotly(m_scores %>% ggplot(aes(gene.score, gene_mean, fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(8)) +
theme_minimal() + theme(legend.position = 'none'))
v = data.frame('ID' = rownames(datExpr), 'gene_sd'=apply(datExpr, 1, sd))
v_scores = genes_DE_info %>% dplyr::select(gene.score, ID) %>% right_join(v, by='ID')
p2 = ggplotly(v_scores %>% ggplot(aes(gene.score, gene_sd, fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(8)) +
theme_minimal() + theme(legend.position = 'none') +
ggtitle('Genes\' mean (left) and variance (right) by SFARI score for raw data'))
subplot(p1, p2, nrows=1)
rm(p1, p2, v, m_scores)
There is heterocedasticity in the data! Score 1 genes’ high variance could be related more to the fact that the gene expression is high than to autism-related effects
h = m %>% left_join(v_scores, by='ID')
ggplotly(h %>% ggplot(aes(x=gene_mean, y=gene_sd, fill=gene.score, color=gene.score)) +
geom_point(alpha=0.3) + scale_fill_manual(values=gg_colour_hue(8)) +
scale_color_manual(values=gg_colour_hue(8)) + scale_x_log10() + scale_y_log10() +
theme_minimal())
Seems like the variance was actually due to the heterocedasticity of the data more than ASD-related because after the transformation it became smaller than the others
datExpr_log2 = log2(datExpr+1)
m = data.frame('ID' = rownames(datExpr_log2), 'gene_mean' = apply(datExpr_log2, 1, mean))
m_scores = genes_DE_info %>% dplyr::select(gene.score, ID) %>% right_join(m, by='ID')
p1 = ggplotly(m_scores %>% ggplot(aes(gene.score, gene_mean, fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(8)) +
theme_minimal() + theme(legend.position = 'none'))
v = data.frame('ID' = rownames(datExpr_log2), 'gene_sd' = apply(datExpr_log2, 1, sd))
v_scores = genes_DE_info %>% dplyr::select(gene.score, ID) %>% right_join(v, by='ID')
p2 = ggplotly(v_scores %>% ggplot(aes(gene.score, gene_sd, fill=gene.score)) +
geom_boxplot() + scale_fill_manual(values=gg_colour_hue(8)) +
theme_minimal() + theme(legend.position = 'none') +
ggtitle('Genes\' mean (left) and variance (right) by SFARI score for log2 data'))
subplot(p1, p2, nrows=1)
rm(p1, p2, v, m_scores)